Open Source in Environmental Sustainability
Contents
Open Source in Environmental Sustainability#
from IPython.display import display, HTML
import dateparser
import datetime
import handcalcs.render
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
import pycountry
from pycountry_convert import country_alpha2_to_continent_code, country_alpha3_to_country_alpha2
# Clean up the dataset
def name_to_iso3(x):
    """Resolve a free-text country name to an ISO 3166-1 alpha-3 code.

    The name is looked up with pycountry's fuzzy search and the first match
    is used. The abbreviation "UK" is replaced by "United Kingdom" before
    the lookup.

    Arguments:
        x - a string with a country name
    Outputs:
        A string with the ISO 3166-1 alpha-3 code of the first match, or an
        empty string if x is not a non-blank string or nothing matches
    """
    # Missing cells reach this function as NaN (a float), not as a string.
    if not isinstance(x, str) or not x.strip():
        return ""
    if x == "UK":
        x = "United Kingdom"
    try:
        return pycountry.countries.search_fuzzy(x)[0].alpha_3
    except LookupError:
        # Only "no match" is treated as a miss; anything else (including a
        # keyboard interrupt during a long .apply) is allowed to propagate.
        return ""
def alpha3_to_alpha2(x):
    """Convert an ISO 3166-1 alpha-3 country code to its alpha-2 code.

    Arguments:
        x - a string with a country code following ISO 3166-1 alpha-3
    Outputs:
        A string with the country code following ISO 3166-1 alpha-2, or an
        empty string if x is empty, not a string, or not a known code
    """
    # An empty code (the "not found" marker) or a missing value cannot be
    # converted, so skip the lookup altogether.
    if not isinstance(x, str) or not x:
        return ""
    try:
        return country_alpha3_to_country_alpha2(x)
    except KeyError:
        # Unknown alpha-3 code
        return ""
def alpha2_to_continent(x):
    """Convert an ISO 3166-1 alpha-2 country code to a continent code.

    Arguments:
        x - a string with a country code following ISO 3166-1 alpha-2
    Outputs:
        A string with the continent code returned by
        country_alpha2_to_continent_code (for example "EU"), or an empty
        string if x is empty, not a string, or not a known code
    """
    # An empty code (the "not found" marker) or a missing value cannot be
    # converted, so skip the lookup altogether.
    if not isinstance(x, str) or not x:
        return ""
    try:
        return country_alpha2_to_continent_code(x)
    except KeyError:
        # Unknown alpha-2 code
        return ""
def upper_string(lower_string):
    """Return the given text in title case.

    Arguments:
        lower_string - a string
    Outputs:
        The result of str.title() applied to lower_string
    """
    titled = lower_string.title()
    return titled
def calc_age(start_date):
    """Return the time elapsed since start_date, expressed in years.

    Arguments:
        start_date - a date in a form that dateparser.parse accepts
    Outputs:
        A float: the whole days between now and start_date divided by 365
    """
    now = datetime.datetime.now()
    parsed_start = dateparser.parse(start_date, settings={'TIMEZONE': 'CEST'})
    elapsed = now - parsed_start
    return elapsed.days / 365
def count_strings(comma_seperated_string):
    """Count the delimiters (commas) in a string.

    Note that this is the number of commas, not the number of items: the
    string "a,b,c" yields 2 and a single item without a comma yields 0.

    Arguments:
        comma_seperated_string - a string containing commas; any other
            type (for example NaN for a missing cell) is accepted as well
    Outputs:
        A number (int) of commas found in comma_seperated_string, or 0 if
        the argument is not a string
    """
    # isinstance also accepts str subclasses, which an exact type
    # comparison would silently count as 0.
    if isinstance(comma_seperated_string, str):
        return comma_seperated_string.count(",")
    return 0
Set Default Plotting Options#
# Default plotting options
# Palette https://coolors.co/palette/0e7c7b-17bebb-ffc857-e9724c-c5283d
# Default figure height. Kept as a plain number: the former ``(800,)`` was a
# one-element tuple (stray trailing comma), which is not a usable height.
height = 800
color_continuous_scale = px.colors.sequential.Aggrnyl[::-1]  # Aggrnyl scale, reversed
marker_color = "#0E7C7B"
color_discrete_sequence = ["#0E7C7B", "#17BEBB", "#FFC857", "#E9724C", "#C5283D"]

# Register the project theme as a named template
pio.templates["OpenSustain"] = go.layout.Template(
    layout=dict(
        font=dict(
            family="Google Font",
            color="#040404",
            size=15,
        ),
        title_font_family="Google Font",
        title_font_color="#040404",
    ),
)
# Combine the project theme with plotly's default template
pio.templates.default = "plotly+OpenSustain"
# Load the project list and rename two columns:
# "rubric" becomes "topic" and "topics" becomes "labels".
df_raw = pd.read_csv("./csv/projects.csv").rename(
    columns={"rubric": "topic", "topics": "labels"}
)
# Show the first rows as the cell output
df_raw.head(5)
| project_name | oneliner | git_namespace | git_url | platform | labels | topic | last_commit_date | stargazers_count | number_of_dependents | ... | organization_name | organization_github_url | organization_website | organization_location | organization_country | organization_form | organization_avatar | organization_public_repos | organization_created | organization_last_update | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | pvlib-python | A set of documented functions for simulating t... | pvlib | https://github.com/pvlib/pvlib-python.git | github | solar-energy,python,renewable-energy,renewable... | Photovoltaics and Solar Energy | 2022/08/31, 04:48:09 | 728.0 | 257.0 | ... | NaN | https://github.com/pvlib | NaN | NaN | NaN | NaN | https://avatars.githubusercontent.com/u/110372... | NaN | NaN | NaN |
| 1 | pvfactors | Open source view-factor model for diffuse shad... | SunPower | https://github.com/SunPower/pvfactors.git | github | solar-energy,renewable-energy,python,bifacial | Photovoltaics and Solar Energy | 2022/02/22, 21:53:32 | 62.0 | 7.0 | ... | NaN | https://github.com/SunPower | NaN | NaN | NaN | NaN | https://avatars.githubusercontent.com/u/134197... | NaN | NaN | NaN |
| 2 | gsee | Global Solar Energy Estimator. | renewables-ninja | https://github.com/renewables-ninja/gsee.git | github | solar,pandas,energy,irradiance,photovoltaic,pv... | Photovoltaics and Solar Energy | 2020/07/21, 06:28:35 | 88.0 | 0.0 | ... | NaN | https://github.com/renewables-ninja | https://www.renewables.ninja/ | NaN | NaN | NaN | https://avatars.githubusercontent.com/u/118382... | NaN | NaN | NaN |
| 3 | PVMismatch | An explicit Python PV system IV & PV curve tra... | SunPower | https://github.com/SunPower/PVMismatch.git | github | numpy,scipy,python,solar,photovoltaic | Photovoltaics and Solar Energy | 2022/04/14, 19:15:36 | 51.0 | 0.0 | ... | NaN | https://github.com/SunPower | NaN | NaN | NaN | NaN | https://avatars.githubusercontent.com/u/134197... | NaN | NaN | NaN |
| 4 | rdtools | An open source library to support reproducible... | NREL | https://github.com/NREL/rdtools.git | github | NaN | Photovoltaics and Solar Energy | 2022/09/02, 16:21:58 | 109.0 | 5.0 | ... | NaN | https://github.com/NREL | http://www.nrel.gov | Golden, CO | NaN | NaN | https://avatars.githubusercontent.com/u/190680... | NaN | NaN | NaN |
5 rows × 51 columns
Calculate Age in Years#
# Age plots are easier to read in years, so derive a year-based age column
# from the age in days (365 days per year).
df_raw["project_age_in_years"] = df_raw["project_age_in_days"] / 365
# Upper age limit used by the age scatter plots further below
max_age_in_years = 8.0
Basis Statistics#
First, let us get a rough overview of the project dataset.
# Overview table with headline statistics of the raw dataset.
# Category counts are looked up with .get(..., 0) so that a category that
# does not occur in the data is reported as 0 instead of raising a KeyError.
platform_counts = df_raw["platform"].value_counts()
active_counts = df_raw["project_active"].value_counts()


def _percent_true(column):
    """Return the share of True values in df_raw[column], in percent.

    Missing values do not count, because value_counts() drops them.
    """
    shares = df_raw[column].value_counts(normalize=True)
    return round(shares.get(True, 0.0) * 100, 2)


fig = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["Dimension", "Value"],
                line_color='#000000',
                fill_color='#ffffff',
                font_size=18,
            ),
            cells=dict(
                fill_color='#ffffff',
                line_color='#ffffff',
                font_size=16,
                height=30,
                # First list: row labels. Second list: the matching values,
                # in the same order.
                values=[
                    [
                        "Total number of projects",
                        "Github projects",
                        "Gitlab projects",
                        "Other platforms",
                        "Number of projects in personal namespace",
                        "Total stars of all projects",
                        "Total contributers of all projects",
                        "Active GitHub projects",
                        "Inactive GitHub projects",
                        "Projects with contribution guide in %",
                        "Projects with code of conduct in %",
                        "Projects accepting donations in %",
                        "Median number of commits",
                        "Median stargazers",
                        "Median stars last year",
                        "Median Development Distribution Score",
                        "Median number of contributors",
                        "Median closed issues last year",
                        "Median commits last year",
                        "Median age in years",
                    ],
                    [
                        df_raw["project_name"].count(),
                        platform_counts.get("github", 0),
                        platform_counts.get("gitlab", 0),
                        platform_counts.get("custom", 0),
                        # Rows without an organization entry
                        df_raw["project_name"].count() - df_raw["organization"].count(),
                        df_raw["stargazers_count"].sum(),
                        df_raw["contributors"].sum(),
                        active_counts.get(True, 0),
                        active_counts.get(False, 0),
                        _percent_true("contribution_guide"),
                        _percent_true("code_of_conduct"),
                        _percent_true("accepts_donations"),
                        df_raw["total_number_of_commits"].median(),
                        df_raw["stargazers_count"].median(),
                        df_raw["stars_last_year"].median(),
                        round(df_raw["development_distribution_score"].median(), 4),
                        df_raw["contributors"].median(),
                        df_raw["issues_closed_last_year"].median(),
                        df_raw["total_commits_last_year"].median(),
                        round(df_raw["project_age_in_years"].median(), 2),
                    ],
                ]
            ),
        )
    ]
)
fig.update_layout(
    height=1000,
    width=1000
)
fig.show()
Development Distribution Score#
The Development Distribution Score (DDS) measures how development is distributed among a project's contributors by relating the contributor with the most commits to all other contributors. Distributing the knowledge, work, and governance of a project across several people makes it more sustainable: when people leave a project or no longer find time for an open source project, others can still continue the work and step into leading positions.
The DDS is calculated in the preprocessing script and is similar to the bus factor. It is based only on quantitative values derived from git statistics.
Filter Data#
# Work on a copy so that df_raw stays untouched
df_active = df_raw.copy()
# Keep only the rows the further analysis is about:
#   - active projects,
#   - no "Curated Lists" (curated lists are not classical open source
#     projects and are not included in the analysis),
#   - projects hosted on the GitHub platform.
keep_rows = (
    (df_active["project_active"] == True)
    & (df_active["topic"] != "Curated Lists")
    & (df_active["platform"] == "github")
)
df_active = df_active[keep_rows]
Score Projects#
# Calculate the scores on activity, community and size.
# Every metric is converted to a percentile rank, so each score is a sum of
# equally weighted ranks.
df_active["activity"] = (
    df_active["total_commits_last_year"].rank(pct=True)
    + df_active["issues_closed_last_year"].rank(pct=True)
    # Fewer days since the last closed issue means MORE activity (the sample
    # row shows 0 days for a project whose last issue was closed on the
    # collection day), so this metric is ranked in descending order.
    # Ranking it ascending rewarded the stalest projects.
    + df_active["days_until_last_issue_closed"].rank(pct=True, ascending=False)
    # na_option="top" gives rows without a release date the lowest ranks
    + df_active["last_released_date"].rank(pct=True, na_option="top")
)
df_active["community"] = (
    df_active["contributors"].rank(pct=True)
    + df_active["development_distribution_score"].rank(pct=True)
    + df_active["reviews_per_pr"].rank(pct=True)
)
df_active["size"] = (
    df_active["total_number_of_commits"].rank(pct=True)
    + df_active["contributors"].rank(pct=True)
    + df_active["closed_issues"].rank(pct=True)
    + df_active["closed_pullrequests"].rank(pct=True)
)
# All scores are weighted equally: each one is scaled by its own maximum and
# the three scaled scores are averaged.
df_active["total_score"] = (
    df_active["activity"] / df_active["activity"].max()
    + df_active["community"] / df_active["community"].max()
    + df_active["size"] / df_active["size"].max()
) / 3
# NOTE(review): plots that use hard-coded axis ranges for these scores
# depend on the score distribution and should be re-checked after changes here.
# Save the dataset with the scores
df_active_path = "./csv/project_analysis.csv"
df_active.to_csv(df_active_path)
%%render
## The calculation within this cell shall give the reader an understanding of how the DDS is calculated.
## Values calculated here are not used in any other cell.
n_MaxCommitsSingleContributor = 90
n_total_commits = 100
DDS = 1 - n_MaxCommitsSingleContributor / n_total_commits
### KK: this is where a clear object naming convention + comments would really help: is syntax df[df_raw[..]] appropriate here?
### KK: it might be helpful to plot boxplots for the below scores per category to better show their distribution, including median
# Subsets used for the median-DDS comparison below
df_personal_projects = df_active[df_active["organization"].isna()]
df_organization_projects = df_active[df_active["organization"].notna()]
df_inactive = df_raw[(df_raw["project_active"] == False)]
df_top_stargazers = df_active[(df_active["stargazers_count"] > 100)]

fig = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["Median DDS", "Value"],
                line_color='#000000',
                fill_color='#ffffff',
                font_size=18,
            ),
            cells=dict(
                line_color='#ffffff',
                fill_color='#ffffff',
                font_size=16,
                height=30,
                # First list: row labels. Second list: the matching medians,
                # in the same order.
                values=[
                    [
                        "All projects",
                        "Active projects in personal namespace",
                        "Active organization projects",
                        "Active projects",
                        "Inactive projects",
                        # The label states the threshold that df_top_stargazers
                        # is actually filtered with (> 100).
                        "Active projects with more than 100 Stars",
                    ],
                    [
                        round(df_raw["development_distribution_score"].median(), 3),
                        round(df_personal_projects["development_distribution_score"].median(), 3),
                        round(df_organization_projects["development_distribution_score"].median(), 3),
                        round(df_active["development_distribution_score"].median(), 3),
                        round(df_inactive["development_distribution_score"].median(), 3),
                        round(df_top_stargazers["development_distribution_score"].median(), 3),
                    ],
                ]
            ),
        )
    ]
)
fig.update_layout(
    width=800
)
fig.show()
# Show one scored row as a sample of the available columns
df_active.iloc[300]
project_name EVCC
oneliner An extensible EV Charge Controller with PV int...
git_namespace andig
git_url https://github.com/evcc-io/evcc.git
platform github
labels mqtt,golang,pv,wallbox,emobility,charger,wallb...
topic Mobility and Transportation
last_commit_date 2022/09/04, 20:03:13
stargazers_count 759.0
number_of_dependents 23.0
stars_last_year 492.0
project_active True
dominating_language Go
organization NaN
organization_user_name evcc-io
languages Go,Vue,JavaScript,Smarty,CSS,Shell,Makefile,Do...
homepage https://evcc.io
refs NaN
project_created 2019/12/06, 16:27:04
project_age_in_days 1003.0
license MIT
total_commits_last_year 1096.0
total_number_of_commits 2190.0
last_issue_closed 2022/09/04, 21:01:01
open_issues 39.0
closed_pullrequests 1394.0
closed_issues 2266.0
issues_closed_last_year 1257.0
days_until_last_issue_closed 0.0
open_pullrequests 23.0
reviews_per_pr 1.1
development_distribution_score 0.222484
last_released_date 2022/08/13, 11:31:37
last_release_tag_name 0.100
good_first_issue 0.0
contributors 57.0
accepts_donations True
donation_platforms github,patreon,open_collective,ko_fi,tidelift,...
code_of_conduct False
contribution_guide False
dependents_repos JanDragon/evcc,opensprinklershop/evcc,matspi/e...
organization_name NaN
organization_github_url https://github.com/evcc-io
organization_website https://evcc.io
organization_location Germany
organization_country NaN
organization_form NaN
organization_avatar https://avatars.githubusercontent.com/u/813835...
organization_public_repos NaN
organization_created NaN
organization_last_update NaN
project_age_in_years 2.747945
activity 2.871102
community 2.193867
size 3.682952
total_score 0.856184
Name: 406, dtype: object
Process Active GitHub Projects#
# Read the scored dataset back from the CSV file at df_active_path
df_active = pd.read_csv(df_active_path)
Start Plotting#
# Count the projects per license.
# The counts and the index are named explicitly so that the column names
# ("license_names", "license") do not depend on the pandas version:
# value_counts() labels its result differently in pandas 1.x and 2.x.
license_his = (
    df_active["license"]
    .value_counts()
    .rename("license")
    .rename_axis("license_names")
    .reset_index()
)
fig = px.pie(license_his, values="license", names="license_names", color_discrete_sequence=color_discrete_sequence, hole=0.2)
fig.update_layout(title="Distribution of Licenses", showlegend=True, font_size=16)
fig.update_traces(textposition='inside', textinfo='percent+label', marker=dict(line=dict(color='#000000', width=1)))
fig.show()
# alternative to the pie chart in cell 23
# main point: ~80% of all open source licences fall under 5 types
main_license_types = ['BSD-3-Clause', 'MIT', 'GPL-3.0', 'CUSTOM', 'Apache-2.0']
alt_df_active = df_active.copy()
# Every license outside the main types is pooled into "Other"
alt_df_active['pooled_license'] = np.where(
    alt_df_active['license'].isin(main_license_types), alt_df_active['license'], 'Other')
# The counts and the index are named explicitly so that the column names
# do not depend on the pandas version (value_counts() labels its result
# differently in pandas 1.x and 2.x).
alt_license_his = (
    alt_df_active["pooled_license"]
    .value_counts()
    .rename("pooled_license")
    .rename_axis("license_names")
    .reset_index()
)
alt_fig = px.pie(alt_license_his, values="pooled_license", names="license_names", color_discrete_sequence=color_discrete_sequence, hole=0.2)
alt_fig.update_layout(title="Distribution of Licenses", showlegend=True, font_size=16)
alt_fig.update_traces(textposition='inside', textinfo='percent+label', marker=dict(line=dict(color='#000000', width=1)))
alt_fig.show()
# Histogram of the project age (in years) of the active projects
fig = px.histogram(
    df_active,
    x="project_age_in_years",
    nbins=50,
    title="Distribution of Project Age in Years",
)
fig.update_layout(xaxis_title="Project Age", yaxis_title="Projects")
fig.update_traces(marker={"color": marker_color})
fig.show()
# Histogram of the total number of commits of the active projects
fig = px.histogram(
    df_active,
    x="total_number_of_commits",
    nbins=50,
    title="Distribution of Total Commits",
)
fig.update_layout(xaxis_title="Project Total Commits", yaxis_title="Projects")
fig.update_traces(marker={"color": marker_color})
fig.show()
## KK: is there a way of grouping these topics differently? The graph below is difficult to read. What's the story/message? That most topics have under 50 projects+show those that have more? Something else?
# Count the projects per topic.
# The counts and the index are named explicitly so that the column names
# ("topic_names", "topic") do not depend on the pandas version:
# value_counts() labels its result differently in pandas 1.x and 2.x.
topic_his = (
    df_active["topic"]
    .value_counts()
    .rename("topic")
    .rename_axis("topic_names")
    .reset_index()
)
fig = px.pie(topic_his, values="topic", names="topic_names", color_discrete_sequence=color_discrete_sequence, hole=0.2)
fig.update_layout(title="Projects within Topics", height=1200, showlegend=False)
fig.update_traces(textposition='outside', textinfo='value+label', marker=dict(line=dict(color='#000000', width=2)))
fig.show()
## KK: Same as above: what's the message? How else can we group the data to get it across?
# Pie chart of the summed contributors per topic
contributors_per_topic = df_active.groupby('topic')['contributors'].sum().reset_index()
fig = px.pie(
    contributors_per_topic,
    values="contributors",
    names="topic",
    color_discrete_sequence=color_discrete_sequence,
    hole=0.2,
)
fig.update_layout(title="Contributors within Topics", height=1200, showlegend=False)
fig.update_traces(
    textposition='outside',
    textinfo='value+label',
    marker={"line": {"color": '#000000', "width": 2}},
)
fig.show()
# Pie chart of the summed stargazers per topic
stars_per_topic = df_active.groupby('topic')['stargazers_count'].sum().reset_index()
fig = px.pie(
    stars_per_topic,
    values="stargazers_count",
    names="topic",
    color_discrete_sequence=color_discrete_sequence,
    hole=0.2,
)
fig.update_layout(title="Stars within topics", height=1000, showlegend=False)
fig.update_traces(
    textposition='outside',
    textinfo='value+label',
    marker={"line": {"color": '#000000', "width": 2}},
)
fig.show()
# Pie chart of the median Development Distribution Score per topic
median_dds_per_topic = df_active.groupby('topic')['development_distribution_score'].median().reset_index()
fig = px.pie(
    median_dds_per_topic,
    values="development_distribution_score",
    names="topic",
    color_discrete_sequence=color_discrete_sequence,
    hole=0.2,
)
fig.update_layout(title="Median Development Distribution Score within topics", height=1000, showlegend=False)
fig.update_traces(
    textposition='outside',
    textinfo='value+label',
    marker={"line": {"color": '#000000', "width": 2}},
)
fig.show()
# Pie chart of the stars gained during the last year, summed per topic.
# The title names the plotted column so that this chart can be told apart
# from the chart of all-time stars per topic.
fig = px.pie(df_active.groupby('topic')['stars_last_year'].sum().reset_index(), values="stars_last_year", names="topic", color_discrete_sequence=color_discrete_sequence, hole=0.2)
fig.update_layout(title="Stars last Year within Topics", height=1200, showlegend=False)
fig.update_traces(textposition='outside', textinfo='value+label', marker=dict(line=dict(color='#000000', width=2)))
fig.show()
# Horizontal bar chart of the summed contributors per topic.
# The chart is built from the groupby result only; the former topic_his
# computation at the top of this cell was never used here and is rebuilt
# before its next use, so it has been dropped.
fig = px.bar(
    df_active.groupby('topic')['contributors'].sum().reset_index().sort_values('contributors', ascending=[False]),
    x="contributors",
    y="topic",
    orientation="h",
)
fig.update_layout(
    height=1000,  # Added parameter
    yaxis_title="Topics",
    xaxis_title="Contributors",
    title="Contributors within Topics",
    coloraxis_colorbar=dict(
        title="DDS",
    ),
    hoverlabel=dict(
        bgcolor="white"
    )
)
fig.update_traces(marker_color=marker_color)
# Last expression of the cell: the returned figure is the cell output
fig.update(layout_showlegend=False)
# Count the projects per topic.
# The counts and the index are named explicitly so that the column names
# ("topic_names", "topic") do not depend on the pandas version:
# value_counts() labels its result differently in pandas 1.x and 2.x.
topic_his = (
    df_active["topic"]
    .value_counts()
    .rename("topic")
    .rename_axis("topic_names")
    .reset_index()
)
fig = px.bar(
    topic_his,
    x="topic",
    y="topic_names",
    orientation="h",
)
fig.update_layout(
    height=1000,  # Added parameter
    yaxis_title="Topics",
    xaxis_title="Projects",
    title="Projects within Topics",
    coloraxis_colorbar=dict(
        title="DDS",
    ),
    hoverlabel=dict(
        bgcolor="white"
    )
)
fig.update_traces(marker_color=marker_color)
# Last expression of the cell: the returned figure is the cell output
fig.update(layout_showlegend=False)
# Count the projects per dominating programming language.
# The counts and the index are named explicitly so that the column names
# do not depend on the pandas version (value_counts() labels its result
# differently in pandas 1.x and 2.x).
license_dominating_language = (
    df_active["dominating_language"]
    .value_counts()
    .rename("dominating_language")
    .rename_axis("dominating_language_names")
    .reset_index()
)
license_dominating_language
# Only languages that dominate more than 4 projects are shown in the chart
license_dominating_language = license_dominating_language[(license_dominating_language["dominating_language"] > 4)]
fig = px.pie(license_dominating_language, values="dominating_language", names="dominating_language_names", color_discrete_sequence=color_discrete_sequence, hole=0.2)
fig.update_layout(title="Distribution of Programming Languages", showlegend=True, font_size=16, height=800)
fig.update_traces(textposition='outside', textinfo='percent+label', marker=dict(line=dict(color='#000000', width=1)))
fig.show()
# KK: I think the question that we should be asking: are there similar patterns followed by most topics? If so, what are they? If not, what are the fields that stand out and what is the difference?
# df_sorted = df.groupby(['topic'], as_index=False)['dominating_language'].agg('sum')
# Count the projects per (topic, dominating language) pair. The count
# column is named via reset_index(name=...) instead of renaming column 0,
# because the default name of that column depends on the pandas version.
df_language_distribution = (
    df_active.value_counts(["topic", "dominating_language"]).reset_index(name="counts")
)
fig = px.scatter(
    df_language_distribution, x="dominating_language", y="topic", size="counts",
)
fig.update_layout(
    height=1000,  # Added parameter
    width=1200,
    xaxis_title="Dominating Language",
    yaxis_title="Topic",
)
fig.update_traces(marker_color=marker_color)
fig.show()
# KK: I think the question that we should be asking: are there similar patterns followed by most topics? If so, what are they? If not, what are the fields that stand out and what is the difference?
# df_sorted = df.groupby(['topic'], as_index=False)['dominating_language'].agg('sum')
# Count the projects per (topic, license) pair. The count column is named
# via reset_index(name=...) instead of renaming column 0, because the
# default name of that column depends on the pandas version.
df_license_distribution = (
    df_active.value_counts(["topic", "license"]).reset_index(name="counts")
)
fig = px.scatter(df_license_distribution, x="license", y="topic", size="counts")
fig.update_layout(
    height=1000,  # Added parameter
    xaxis_title="License",
    yaxis_title="topic",
    title="License Distribution over Topic",
    autosize=True,
)
fig.update_traces(marker_color=marker_color)
fig.show()
# Histogram of the number of contributors of the active projects
fig = px.histogram(
    df_active,
    x="contributors",
    nbins=100,
    title=" Contributors",
)
fig.update_layout(xaxis_title="Contributors", yaxis_title="Projects")
fig.update_traces(marker={"color": marker_color})
fig.show()
# Number of listed projects per git namespace, most frequent first.
# The counts and the index are named explicitly ("Namespace", "counts")
# so that the column names do not depend on the pandas version:
# value_counts() labels its result differently in pandas 1.x and 2.x.
most_listed_projects = (
    df_active["git_namespace"]
    .value_counts(ascending=False)
    .rename("counts")
    .rename_axis("Namespace")
    .reset_index()
)
fig = go.Figure(data=[go.Table(
    header=dict(values=list(most_listed_projects.columns), line_color='#000000', fill_color='#ffffff', font_size=18),
    cells=dict(line_color='#ffffff', fill_color='#ffffff', font_size=16, height=30, values=[most_listed_projects.Namespace, most_listed_projects.counts])
)])
fig.update_layout(
    autosize=False,
)
fig.show()
# The 40 projects with the highest age, coloured by their DDS
oldest_projects = df_active.nlargest(40, "project_age_in_years")
fig = px.bar(
    oldest_projects,
    x=oldest_projects["project_age_in_years"],
    y=oldest_projects["project_name"],
    color=oldest_projects["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
    hover_name=oldest_projects["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    orientation="h",
    range_x=(9.6, 14),
)
# Last expression of the cell: the returned figure is the cell output
fig.update_layout(
    title="The oldest Projects still active",
    xaxis_title="Project Age in Years",
    yaxis_title="Project",
    height=1000,
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
    showlegend=False,
)
# The 40 projects with the most contributors, coloured by their DDS
contributors = df_active.nlargest(40, "contributors")
fig = px.bar(
    contributors,
    x=contributors["contributors"],
    y=contributors["project_name"],
    color=contributors["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
    hover_name=contributors["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    orientation="h",
    title="Projects with most contributors",
)
# The layout title set here replaces the title passed to px.bar above.
# Last expression of the cell: the returned figure is the cell output.
fig.update_layout(
    title="Projects with the most contributors",
    xaxis_title="Contributors",
    yaxis_title="Project",
    height=1200,
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
    showlegend=False,
)
# The 40 projects with the most stars, coloured by their DDS
top_stargazers = df_active.nlargest(40, "stargazers_count")
fig = px.bar(
    top_stargazers,
    x=top_stargazers["stargazers_count"],
    y=top_stargazers["project_name"],
    color=top_stargazers["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
    hover_name=top_stargazers["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    orientation="h",
)
# Last expression of the cell: the returned figure is the cell output
fig.update_layout(
    title="Projects with the most Stars",
    xaxis_title="Stars",
    yaxis_title="Project",
    height=1000,
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
    showlegend=False,
)
# Star growth = stars gained last year relative to all stars.
# Only projects with more than 100 stars are considered.
has_over_100_stars = df_active["stargazers_count"] > 100
df_top_100_stargazers = df_active[has_over_100_stars].copy()
df_top_100_stargazers["star_growth"] = (
    df_top_100_stargazers["stars_last_year"] / df_top_100_stargazers["stargazers_count"]
)
df_top_40_star_growth = df_top_100_stargazers.nlargest(40, "star_growth")
fig = px.bar(
    df_top_40_star_growth,
    # Shown in percent
    x=df_top_40_star_growth["star_growth"] * 100,
    y=df_top_40_star_growth["project_name"],
    color=df_top_40_star_growth["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
    hover_name=df_top_40_star_growth["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    orientation="h",
)
# Last expression of the cell: the returned figure is the cell output
fig.update_layout(
    title="Projects with the highest Star Growth",
    xaxis_title="Star Growth last Year [%]",
    yaxis_title="Project",
    height=1000,
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
)
# The projects with the most commits during the last year, coloured by DDS.
df_top_40_growth = df_active.nlargest(40, "total_commits_last_year")
# NOTE(review): ElexonDataPortal is excluded by name; the reason is not
# visible in this notebook - presumably an outlier, confirm with the author.
df_top_40_growth = df_top_40_growth[df_top_40_growth["project_name"] != "ElexonDataPortal"]
fig = px.bar(
    df_top_40_growth,
    x=df_top_40_growth["total_commits_last_year"],
    y=df_top_40_growth["project_name"],
    orientation="h",
    color=df_top_40_growth["development_distribution_score"],
    hover_name=df_top_40_growth["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    color_continuous_scale=color_continuous_scale,
)
# The x axis shows the absolute number of commits of the last year, not a
# growth rate in percent, so the axis label and the title say exactly that.
# Last expression of the cell: the returned figure is the cell output.
fig.update_layout(
    height=1000,  # Added parameter
    xaxis_title="Commits last Year",
    yaxis_title="Project",
    title="Projects with the most Commits last Year",
    coloraxis_colorbar=dict(
        title="DDS",
    ),
    hoverlabel=dict(
        bgcolor="white"
    )
)
# The 40 projects with the highest total score, coloured by their DDS
df_total_score = df_active.nlargest(40, "total_score")
fig = px.bar(
    df_total_score,
    x=df_total_score["total_score"],
    y=df_total_score["project_name"],
    color=df_total_score["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
    hover_name=df_total_score["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    orientation="h",
    range_x=(0.85, 1),
)
# Last expression of the cell: the returned figure is the cell output
fig.update_layout(
    title="Top Total Score",
    xaxis_title="Total Score",
    yaxis_title="Project",
    height=1000,
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
    showlegend=False,
)
# The 40 projects with the highest activity score, coloured by their DDS
df_activity_score = df_active.nlargest(40, "activity")
fig = px.bar(
    df_activity_score,
    x=df_activity_score["activity"],
    y=df_activity_score["project_name"],
    color=df_activity_score["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
    hover_name=df_activity_score["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    orientation="h",
    range_x=(2.9, 3.2),
)
# Last expression of the cell: the returned figure is the cell output
fig.update_layout(
    title="Projects with the highest Activity Score",
    xaxis_title="Activity Score",
    yaxis_title="Project",
    height=1000,
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
    showlegend=False,
)
# The 40 projects with the highest size score, coloured by their DDS
df_size_score = df_active.nlargest(40, "size")
fig = px.bar(
    df_size_score,
    x=df_size_score["size"],
    y=df_size_score["project_name"],
    color=df_size_score["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
    hover_name=df_size_score["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    orientation="h",
    range_x=(3.75, 4),
)
# Last expression of the cell: the returned figure is the cell output
fig.update_layout(
    title="Projects with the highest Size Score",
    xaxis_title="Size Score",
    yaxis_title="Project",
    height=1000,
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
    showlegend=False,
)
# KK: I'd suggest selecting a few most interesting examples conveying a message and put plots with raw data in the Appendix
# Scatter of project age against topic for projects younger than
# max_age_in_years; marker size is the size score, colour the total score.
fig = px.scatter(
    df_active.query("project_age_in_years<@max_age_in_years"),
    x="project_age_in_years",
    y="topic",
    color="total_score",
    size="size",
    size_max=20,
    hover_name="git_url",
    hover_data=["oneliner", "topic", "git_namespace"],
)
fig.update_layout(
    title="Total Score of Projects",
    xaxis_title="Project Age in Years",
    yaxis_title="Topic",
    height=1000,
    coloraxis_colorbar={"title": "Total Score"},
    hoverlabel={"bgcolor": "white"},
)
fig.show()
# KK: I'd suggest selecting a few most interesting examples conveying a message and put plots with raw data in the Appendix
# Scatter of project age against topic for organization projects younger
# than max_age_in_years; marker size is the size score, colour the DDS.
fig = px.scatter(
    df_organization_projects.query("project_age_in_years<@max_age_in_years"),
    x="project_age_in_years",
    y="topic",
    color="development_distribution_score",
    size="size",
    size_max=20,
    hover_name="git_url",
    hover_data=["oneliner", "topic", "git_namespace"],
)
fig.update_layout(
    title="Development Distribution Score",
    xaxis_title="Project Age in Years",
    yaxis_title="Topic",
    height=1000,
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
)
fig.show()
# The 40 most starred projects without an organization entry, coloured by
# their DDS. The bars are labelled with the git namespace.
personal_stargazers = df_personal_projects.nlargest(40, "stargazers_count")
fig = px.bar(
    personal_stargazers,
    x=personal_stargazers["stargazers_count"],
    y=personal_stargazers["git_namespace"],
    color=personal_stargazers["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
    hover_name=personal_stargazers["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    orientation="h",
)
# Last expression of the cell: the returned figure is the cell output
fig.update_layout(
    title="Projects with most Stars in User Namespace",
    xaxis_title="Stars",
    yaxis_title="Project",
    height=1000,
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
    showlegend=False,
)
# KK: can topics be grouped in fewer categories? can DDS be bucketed into categories, e.g. 0.3>=, 0.3<=&<=0.6, 0.6>=? Do we need to show all three variables, projects, DDS and dependents?
# dependents_count is the number of commas in the comma separated list of
# dependent repositories (see count_strings).
df_active["dependents_count"] = df_active["dependents_repos"].apply(count_strings)
most_dependent_projects = df_active.nlargest(50, "dependents_count")
# NOTE(review): "Mission Support System" is excluded by name; the reason is
# not visible in this notebook - confirm with the author.
most_dependent_projects = most_dependent_projects[most_dependent_projects["project_name"] != "Mission Support System"]
# Median DDS over the selected projects
print("DDS of most used Python project:", round(most_dependent_projects["development_distribution_score"].median(), 3))
fig = px.bar(
    most_dependent_projects,
    x=most_dependent_projects["dependents_count"],
    y=most_dependent_projects["project_name"],
    orientation="h",
    hover_name=most_dependent_projects["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    color=most_dependent_projects["development_distribution_score"],
    color_continuous_scale=color_continuous_scale
)
# The y axis lists project names, so it is labelled "Project".
# Last expression of the cell: the returned figure is the cell output.
fig.update_layout(
    height=1000,  # Added parameter
    yaxis_title="Project",
    xaxis_title="Dependents",
    title="Most used Python Projects vs. DDS",
    coloraxis_colorbar=dict(
        title="DDS",
    ),
    hoverlabel=dict(
        bgcolor="white"
    )
)
DDS of most used Python project: 0.436
Process the organizations#
# Load the list of GitHub organizations
df_organizations = pd.read_csv("./csv/github_organizations.csv")
# Show the first rows as the cell output
df_organizations.head()
| organization_name | organization_user_name | organization_github_url | organization_website | location_city | location_country | form_of_organization | organization_avatar | organization_public_repos | organization_created | organization_last_update | rubric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | AgroCares | https://github.com/AgroCares | https://grasplan.nl/ | NaN | Netherlands | community | https://avatars.githubusercontent.com/u/316846... | 6 | 2017-09-06 06:22 | 2021-11-16 11:18 | NaN |
| 1 | DSMR-reader | dsmrreader | https://github.com/dsmrreader | https://dsmr-reader.readthedocs.io | NaN | Netherlands | community | https://avatars.githubusercontent.com/u/577273... | 1 | 2019-11-13 19:08 | 2021-11-14 20:43 | NaN |
| 2 | STS Rosario | STS-Rosario | https://github.com/STS-Rosario | http://www.stsrosario.org.ar/index.html | NaN | Argentina | community | https://avatars.githubusercontent.com/u/244938... | 2 | 2016-12-10 14:07 | 2021-11-03 21:52 | NaN |
| 3 | Open Solar Project | opensolarproject | https://github.com/opensolarproject | NaN | NaN | Australia | for-profit | https://avatars.githubusercontent.com/u/539539... | 2 | 2019-08-09 20:31 | 2021-11-14 16:10 | NaN |
| 4 | Open Food Foundation | openfoodfoundation | https://github.com/openfoodfoundation | https://www.openfoodnetwork.org/open-food-foun... | Melbourne | Australia | non-profit | https://avatars.githubusercontent.com/u/257898... | 53 | 2012-10-17 07:53 | 2021-11-19 05:14 | NaN |
# Derive the continent of every organization from its country name:
# country name -> ISO alpha-3 -> ISO alpha-2 -> continent code.
# Each helper returns "" when it cannot resolve its input.
df_organizations["ISO_3"] = df_organizations["location_country"].apply(name_to_iso3)
df_organizations["ISO_3_alpha2"] = df_organizations["ISO_3"].apply(alpha3_to_alpha2)
df_organizations["continent"] = df_organizations["ISO_3_alpha2"].apply(alpha2_to_continent)
# Count the organizations per continent. The counts and the index are named
# explicitly ("continent", "continent_name") so that the column names do not
# depend on the pandas version (value_counts() labels its result differently
# in pandas 1.x and 2.x).
continent_his = (
    df_organizations["continent"]
    .value_counts()
    .rename("continent")
    .rename_axis("continent_name")
    .to_frame()
)
# Replace the continent codes by readable names; organizations without a
# resolved continent ("") are shown as "Global".
continent_his.rename(index={"EU": "Europe", "NA": "North America", "": "Global", "OC": "Oceania", "AS": "Asia", "SA": "South America", "AF": "Africa"}, inplace=True)
print(continent_his)
fig = px.pie(continent_his.reset_index(), values="continent", names="continent_name", color_discrete_sequence=color_discrete_sequence, hole=0.2)
fig.update_layout(title="Distribution of Organizations between Continents", font_size=16)
fig.update_traces(textposition='outside', textinfo='value+label', marker=dict(line=dict(color='#000000', width=2)))
fig.show()
continent
continent_name
Europe 203
North America 191
Global 176
Oceania 19
Asia 12
South America 6
Africa 4
# Alternative to the plot in cell 52: pool the continents with few
# organizations into a single "Other" slice.
alt_df_organizations = df_organizations.copy()
vals_to_replace = {"EU": "Europe", "NA": "North America", "": "Global", "OC":"Other", "AS":"Other", "SA":"Other", "AF": "Other"}
# Codes missing from the mapping become NaN and are dropped by value_counts().
alt_df_organizations['continent'] = alt_df_organizations['continent'].map(vals_to_replace)

# The counts column is named explicitly: pandas >= 2.0 names the
# value_counts() result "count" rather than reusing the source column name,
# which would break values="continent" below.
alt_continent_his = (
    alt_df_organizations["continent"]
    .value_counts()
    .rename("continent")
    .to_frame()
    .rename_axis("continent_name")
)
print(alt_continent_his)

alt_fig = px.pie(
    alt_continent_his.reset_index(),
    values="continent",
    names="continent_name",
    color_discrete_sequence=color_discrete_sequence,
    hole=0.2,
)
alt_fig.update_layout(title="Distribution of Organizations between Continents", font_size=16)
alt_fig.update_traces(textposition='outside', textinfo='value+label', marker=dict(line=dict(color='#000000', width=2)))
alt_fig.show()
continent
continent_name
Europe 203
North America 191
Global 176
Other 41
## https://octoverse.github.com/
# Share of users per continent, in percent.
# The values must be an ordered sequence: a set has no defined iteration
# order, so labels and percentages could be paired arbitrarily, and recent
# pandas versions refuse to build a DataFrame from a set at all.
# NOTE(review): the pairing below assumes the labels were listed in ascending
# order of share - confirm the figures against the Octoverse report.
index_labels = ['Oceania', 'Africa', 'South America', 'Europe', 'Asia', 'North America']
values = [1.7, 2.3, 5.9, 27.3, 31.2, 31.5]
# reset_index() yields the columns "index" (continent name) and 0 (share).
df_users_continent_cotoverse = pd.DataFrame(values, index=index_labels).reset_index()
# similar pooling to the one in cell 53 could be done here for Africa + Oceania
fig = px.pie(
    df_users_continent_cotoverse,
    values=0,
    names="index",
    color_discrete_sequence=color_discrete_sequence,
    hole=0.2,
)
fig.update_layout(title="Distribution of Users between Continents", font_size=16)
fig.update_traces(textposition='outside', textinfo='value+label', marker=dict(line=dict(color='#000000', width=2)))
fig.show()
# Count organizations per organizational form.
# reset_index(name=...) names the counts column explicitly, so the frame has
# the columns "organization" / "form_of_organization" on every pandas version
# (pandas >= 2.0 would otherwise name the counts column "count").
organization_his = (
    df_organizations["form_of_organization"]
    .value_counts()
    .rename_axis("organization")
    .reset_index(name="form_of_organization")
)
# Title-case the labels for display, e.g. "non-profit" -> "Non-Profit".
organization_his["organization"] = organization_his["organization"].apply(upper_string)
print(organization_his)

fig = px.pie(
    organization_his,
    values="form_of_organization",
    names="organization",
    color_discrete_sequence=color_discrete_sequence,
    hole=0.2,
)
fig.update_layout(title="Distribution of Organizational Forms", font_size=16)
fig.update_traces(textposition='outside', textinfo='percent+label', marker=dict(line=dict(color='#000000', width=2)))
fig.show()
organization form_of_organization
0 Community 160
1 Academia 144
2 Government Agency 99
3 For-Profit 85
4 Non-Profit 65
5 Collaboration 58
# Count organizations per ISO 3166-1 alpha-3 country code.
# reset_index(name="counts") names the counts column directly; the previous
# rename of an "ISO_3" column silently did nothing on pandas >= 2.0, where
# value_counts() names its result "count".
df_countries = (
    df_organizations["ISO_3"]
    .value_counts()
    .rename_axis("country")
    .reset_index(name="counts")
)

# World map coloured by the number of organizations per country.
fig = px.choropleth(
    df_countries,
    locations="country",
    locationmode="ISO-3",
    color="counts",
    color_continuous_scale=color_continuous_scale,
)
fig.update_layout(
    title="Distribution of Organizational Locations Worldwide",
    coloraxis_colorbar=dict(
        title="Organizations",
    ),
)
fig.show()
# Keep the 40 organizations with the most public repositories.
df_public_repos = df_organizations.nlargest(n=40, columns="organization_public_repos")
# Preview the first rows of that selection.
df_public_repos.head()
| organization_name | organization_user_name | organization_github_url | organization_website | location_city | location_country | form_of_organization | organization_avatar | organization_public_repos | organization_created | organization_last_update | rubric | ISO_3 | ISO_3_alpha2 | continent | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 298 | Microsoft | microsoft | https://github.com/microsoft | https://opensource.microsoft.com | Redmond, WA | USA | for-profit | https://avatars.githubusercontent.com/u/615472... | 4485 | 2013-12-10 19:06 | 2021-11-20 00:29 | NaN | USA | US | NA |
| 307 | International Business Machines | IBM | https://github.com/IBM | https://www.ibm.com/opensource/ | Armonk, NY | USA | for-profit | https://avatars.githubusercontent.com/u/145911... | 2278 | 2012-02-21 22:13 | 2021-11-19 23:15 | NaN | USA | US | NA |
| 321 | The Apache Software Foundation | apache | https://github.com/apache | https://www.apache.org/ | NaN | USA | non-profit | https://avatars.githubusercontent.com/u/47359?v=4 | 2275 | 2009-01-17 20:14 | 2021-11-20 00:34 | NaN | USA | US | NA |
| 296 | https://github.com/google | https://opensource.google/ | Mountain View, CA | USA | for-profit | https://avatars.githubusercontent.com/u/134200... | 2139 | 2012-01-18 01:30 | 2021-11-19 22:27 | NaN | USA | US | NA | ||
| 292 | Microsoft Azure | Azure | https://github.com/Azure | https://docs.microsoft.com/en-us/azure/ | Redmond, WA | USA | for-profit | https://avatars.githubusercontent.com/u/684449... | 1618 | 2014-03-03 22:17 | 2021-11-19 23:35 | NaN | USA | US | NA |
# Age of every organization in years, derived from its creation timestamp.
df_organizations["organizations_age_in_years"] = df_organizations["organization_created"].apply(calc_age)

# Bubble chart: organization age against country, sized by the number of
# public repositories and coloured by organizational form. Only organizations
# younger than max_age_in_years are shown.
fig = px.scatter(
    df_organizations.query("organizations_age_in_years<@max_age_in_years"),
    x="organizations_age_in_years",
    y="location_country",
    size="organization_public_repos",
    color="form_of_organization",
    hover_name="organization_website",
    hover_data=["organization_name"],
    size_max=20,
    # NOTE(review): the colour column holds category strings, so this
    # continuous scale presumably has no visible effect - confirm.
    color_continuous_scale=color_continuous_scale,
)
fig.update_layout(
    # The colour encodes the organizational form, so label the legend rather
    # than a "DDS" colorbar that this chart does not have.
    legend_title_text="Form of Organization",
    # Axis titles describe what is actually plotted: countries on the y axis
    # and the organization age on the x axis.
    yaxis_title="Country",
    xaxis_title="Organization Age in Years",
    height=1000,  # Added parameter
    title="Organizations forms within different countries",
    hoverlabel=dict(
        bgcolor="white"
    )
)
fig.show()
Not included Projects#
Within the first version of this study we were not able to integrate a GitLab API interface. Projects on self-hosted repositories and other collaborative websites could also not be included in the study. Another group that was not included in the study are the inactive projects. Here we try to give an insight into these projects.
# Show the projects hosted on GitLab.
df_raw[df_raw["platform"].eq("gitlab")]
| project_name | oneliner | git_namespace | git_url | platform | labels | topic | last_commit_date | stargazers_count | number_of_dependents | ... | organization_github_url | organization_website | organization_location | organization_country | organization_form | organization_avatar | organization_public_repos | organization_created | organization_last_update | project_age_in_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 136 | emobpy | An open tool for creating battery-electric veh... | diw-evu/emobpy | https://gitlab.com/diw-evu/emobpy/emobpy | gitlab | NaN | Battery | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 190 | dieter_py | An open source power sector optimization model... | diw-evu/dieter_public | https://gitlab.com/diw-evu/dieter_public/dieterpy | gitlab | NaN | Energy Modeling and Optimization | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 279 | pyehub | A Python-based, modular and nestable implement... | energyincities | https://gitlab.com/energyincities/python-ehub | gitlab | NaN | Energy Distribution and Grids | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 286 | mosaik | A flexible Smart Grid co-simulation framework. | mosaik | https://gitlab.com/mosaik/mosaik | gitlab | NaN | Energy Distribution and Grids | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 287 | SmartGridToolbox | Designed to provide an extensible and flexible... | SmartGridToolbox | https://gitlab.com/SmartGridToolbox/SmartGridT... | gitlab | NaN | Energy Distribution and Grids | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 326 | KoaVTracker | Energy targets in the coalition agreement of t... | diw-evu | https://gitlab.com/diw-evu/koavtracker | gitlab | NaN | Datasets on Energy Systems | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 359 | Energy Signature Analyser | A toolbox to analyze energy signatures of buil... | energyincities | https://gitlab.com/energyincities/energy-signa... | gitlab | NaN | Buildings and Heating | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 367 | BESOS | A collection of modules for the simulation and... | energyincities | https://gitlab.com/energyincities/besos | gitlab | NaN | Buildings and Heating | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 384 | Macquette | A whole house energy assessment tool, which mo... | retrofitcoop | https://gitlab.com/retrofitcoop/macquette | gitlab | NaN | Buildings and Heating | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 429 | sustainable-mobility-api | Consists of a Python library and HTTP API for ... | mshepherd | https://gitlab.com/mshepherd/sustainable-mobil... | gitlab | NaN | Mobility and Transportation | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 462 | H2020 CATALYST | Converting data centres in energy flexibility ... | NaN | https://gitlab.com/project-catalyst | gitlab | NaN | Computation and Communication | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 469 | ecometer | Loads websites, compute metrics (from network ... | ecoconceptionweb | https://gitlab.com/ecoconceptionweb/ecometer | gitlab | NaN | Computation and Communication | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 478 | Carbon-API-2.0 | Estimating the carbon emissions per page on th... | wholegrain | https://gitlab.com/wholegrain/carbon-api-2-0 | gitlab | NaN | Computation and Communication | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 504 | CarbonFootprint | A browser extension that displays carbon footp... | aossie | https://gitlab.com/aossie/CarbonFootprint | gitlab | NaN | Carbon Intensity and Accounting | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 519 | OpenIAM | An open source integrated assessment model dev... | NRAP | https://gitlab.com/NRAP/OpenIAM | gitlab | NaN | Carbon Capture and Removel | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 530 | vein | An R package to estimate Vehicular Emissions I... | ibarraespinosa | https://gitlab.com/ibarraespinosa/vein | gitlab | NaN | Emission Observation and Modeling | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 623 | OpenSimRoot | Source code for simulating root architecture, ... | rootmodels | https://gitlab.com/rootmodels/OpenSimRoot | gitlab | NaN | Biosphere | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 684 | EU forest tree point data | A compilation of analysis-ready point data for... | openlandmap | https://gitlab.com/openlandmap/eu-forest-tree-... | gitlab | NaN | Biosphere | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 723 | GlaThiDa | Glacier Thickness Database. | wgms | https://gitlab.com/wgms/glathida | gitlab | NaN | Cryosphere | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 820 | met.3D | Interactive three-dimensional visualization of... | wxmetvis | https://gitlab.com/wxmetvis/met.3d | gitlab | NaN | Atmosphere | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 838 | The Global Environmental Multiscale Model | An integrated forecasting and data assimilatio... | eccc/gem | https://gitlab.com/eccc/gem/gem | gitlab | NaN | Earth and Climate Modeling | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1014 | Imod-Python | Designed to help you in your MODFLOW groundwat... | deltares/imod | https://gitlab.com/deltares/imod/imod-python | gitlab | NaN | Water Supply | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1026 | RTC-Tools | A toolbox for control and optimization of wate... | deltares | https://gitlab.com/deltares/rtc-tools | gitlab | NaN | Water Supply | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1057 | WWTP | EU28 Waste Water Treatment Plants. | hotmaps/potential | https://gitlab.com/hotmaps/potential/WWTP | gitlab | NaN | Water Supply | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1106 | OpenLandMap | Data, services and web-apps providing access a... | NaN | https://gitlab.com/openlandmap | gitlab | NaN | Soil and Land | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1200 | XDC Model | Enable users, or any intereted subject, to und... | xdc-model | https://gitlab.com/xdc-model/xdc | gitlab | NaN | Sustainable Investment | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1263 | Eumap | Comprises environmental, land cover, terrain, ... | geoharmonizer_inea | https://gitlab.com/geoharmonizer_inea/eumap | gitlab | NaN | Data Catalogs and Interfaces | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
27 rows × 52 columns
# Work on a copy of the projects flagged as inactive so the added column does
# not touch df_raw.
df_inactive = df_raw[(df_raw["project_active"] == False)].copy()
# Age plots are better in years
df_inactive["project_age_in_years"] = df_inactive["project_age_in_days"] / 365

# Bubble chart: project age against topic, sized by contributor count and
# coloured by the development distribution score (DDS).
fig = px.scatter(
    df_inactive,
    x="project_age_in_years",
    y="topic",
    size="contributors",
    color="development_distribution_score",
    hover_name="git_url",
    size_max=20,
    # Pass a reversed *copy* of the palette: list.reverse() returns None (so
    # no custom scale would be applied) and reverses the shared palette in
    # place for every later cell.
    color_continuous_scale=list(reversed(color_continuous_scale)),
)
fig.update_layout(
    coloraxis_colorbar=dict(
        title="DDS",
    ),
    paper_bgcolor="lightgray",
    height=1000,  # Added parameter
    yaxis_title="Topic",
    xaxis_title="Project Age in years",
    title="Development Distribution Score within inactive Projects",
    hoverlabel=dict(
        bgcolor="white"
    )
)
fig.show()